# Fill colour shared by the plots in this script.
fillcolor <- "#D83434"

# Load the first 280984 lines of the line-delimited review JSON file.
# textConnection() returns an already-open connection, which stream_in()
# does not close on its own, so it is bound to a name and closed explicitly
# to avoid leaking the connection.
file_name <- "yelp_academic_dataset_review.json"
review_con <- textConnection(readLines(file_name, n = 280984))
review <- jsonlite::stream_in(review_con, flatten = TRUE)
close(review_con)
summary(review)
## review_id user_id business_id stars
## Length:280984 Length:280984 Length:280984 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:3.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.838
## 3rd Qu.:5.000
## Max. :5.000
## useful funny cool text
## Min. : 0.000 Min. : 0.0000 Min. : 0.0000 Length:280984
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.0000 Class :character
## Median : 0.000 Median : 0.0000 Median : 0.0000 Mode :character
## Mean : 0.912 Mean : 0.2606 Mean : 0.3515
## 3rd Qu.: 1.000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :320.000 Max. :284.0000 Max. :79.0000
## date
## Length:280984
## Class :character
## Mode :character
##
##
##
# Animated histogram of review star ratings, revealed one rating at a time.
# `fill` is a fixed colour, so it is set as a layer parameter. Placing it
# inside aes() would map the hex string as a data value, which makes ggplot2
# pick a default palette colour and add a meaningless legend entry.
ggplot(review, aes(x = stars)) +
  geom_histogram(aes(color = stars), fill = fillcolor, binwidth = 1) +
  xlab("Rating") +
  ylab("Count") +
  ggtitle("Count of Yelp Review Ratings") +
  theme_classic() +
  transition_states(stars, transition_length = 3, state_length = 1) +
  shadow_mark(alpha = .3)
# Saves the most recently rendered animation.
anim_save("ratinghistogram.gif")
# Load the line-delimited business JSON file (at most 280984 lines are read).
# textConnection() returns an already-open connection, which stream_in()
# does not close on its own, so it is closed explicitly to avoid a leak.
file_name <- "yelp_academic_dataset_business.json"
business_con <- textConnection(readLines(file_name, n = 280984))
business <- jsonlite::stream_in(business_con, flatten = TRUE)
close(business_con)
summary(business)
## business_id name address city
## Length:150346 Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## state postal_code latitude longitude
## Length:150346 Length:150346 Min. :27.56 Min. :-120.10
## Class :character Class :character 1st Qu.:32.19 1st Qu.: -90.36
## Mode :character Mode :character Median :38.78 Median : -86.12
## Mean :36.67 Mean : -89.36
## 3rd Qu.:39.95 3rd Qu.: -75.42
## Max. :53.68 Max. : -73.20
## stars review_count is_open categories
## Min. :1.000 Min. : 5.00 Min. :0.0000 Length:150346
## 1st Qu.:3.000 1st Qu.: 8.00 1st Qu.:1.0000 Class :character
## Median :3.500 Median : 15.00 Median :1.0000 Mode :character
## Mean :3.597 Mean : 44.87 Mean :0.7962
## 3rd Qu.:4.500 3rd Qu.: 37.00 3rd Qu.:1.0000
## Max. :5.000 Max. :7568.00 Max. :1.0000
## attributes.ByAppointmentOnly attributes.BusinessAcceptsCreditCards
## Length:150346 Length:150346
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## attributes.BikeParking attributes.RestaurantsPriceRange2 attributes.CoatCheck
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.RestaurantsTakeOut attributes.RestaurantsDelivery
## Length:150346 Length:150346
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## attributes.Caters attributes.WiFi attributes.BusinessParking
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.WheelchairAccessible attributes.HappyHour attributes.OutdoorSeating
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.HasTV attributes.RestaurantsReservations attributes.DogsAllowed
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.Alcohol attributes.GoodForKids attributes.RestaurantsAttire
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.Ambience attributes.RestaurantsTableService
## Length:150346 Length:150346
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## attributes.RestaurantsGoodForGroups attributes.DriveThru attributes.NoiseLevel
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.GoodForMeal attributes.BusinessAcceptsBitcoin attributes.Smoking
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.Music attributes.GoodForDancing attributes.AcceptsInsurance
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.BestNights attributes.BYOB attributes.Corkage
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.BYOBCorkage attributes.HairSpecializesIn attributes.Open24Hours
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## attributes.RestaurantsCounterService attributes.AgesAllowed
## Length:150346 Length:150346
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## attributes.DietaryRestrictions hours.Monday hours.Tuesday
## Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## hours.Wednesday hours.Thursday hours.Friday hours.Saturday
## Length:150346 Length:150346 Length:150346 Length:150346
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## hours.Sunday
## Length:150346
## Class :character
## Mode :character
##
##
##
# Keep only the first twelve columns, dropping the flattened `attributes.*`
# and `hours.*` columns that follow them.
business <- business[seq_len(12)]
business
business_id <chr> | name <chr> | ||
|---|---|---|---|
| 1 | Pns2l4eNsfO8kk83dixA6A | Abby Rappoport, LAC, CMQ | |
| 2 | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | |
| 3 | tUFrWirKiKi_TAnsVWINQQ | Target | |
| 4 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | |
| 5 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | |
| 6 | CF33F8-E6oudUQ46HnavjQ | Sonic Drive-In | |
| 7 | n_0UpQx1hsNbnPUSlodU8w | Famous Footwear | |
| 8 | qkRM_2X51Yqxk3btlwAQIg | Temple Beth-El | |
| 9 | k0hlBqXX-Bt0vf1op7Jr1w | Tsevi's Pub And Grill | |
| 10 | bBDDEgkFA1Otx9Lfe7BZUQ | Sonic Drive-In |
# Spread the ", "-separated category string into one `categories_*` count
# column per distinct category (0 where a business lacks the category).
business_wide <- business %>%
  mutate(categories = strsplit(categories, ", ")) %>%
  unnest(categories) %>%
  arrange(categories) %>%
  pivot_wider(
    names_from = categories,
    values_from = categories,
    names_prefix = "categories_",
    names_repair = "universal",
    values_fn = length,
    values_fill = 0
  )
# Count how often each individual category occurs and plot the ten most
# frequent ones.
# The category strings are ", "-separated (e.g. "Sushi Bars, Restaurants,
# Japanese"), so they must be split on ", ". Splitting on ";" leaves every
# string intact and counts whole category combinations instead.
categories <- str_split(business$categories, ", ")
categories <- data.frame(Name = unlist(categories), stringsAsFactors = FALSE)
categories %>%
  filter(!is.na(Name)) %>% # businesses without categories are not a category
  group_by(Name) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  mutate(Name = reorder(Name, Count)) %>%
  ggplot(aes(x = Name, y = Count)) +
  geom_bar(stat = "identity", colour = "white", fill = fillcolor) +
  labs(
    x = "Name of Category", y = "Count",
    title = "Top 10 Categories of Business"
  ) +
  coord_flip() +
  theme_classic()
# Businesses whose category string contains "Restaurant".
restaurants <- business[grep("Restaurant", business$categories), ]
dim(restaurants)
## [1] 52286 12
# Category strings of the restaurant businesses
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(categories)
categories <chr> |
|---|
| Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries |
| Burgers, Fast Food, Sandwiches, Food, Ice Cream & Frozen Yogurt, Restaurants |
| Pubs, Restaurants, Italian, Bars, American (Traditional), Nightlife, Greek |
| Ice Cream & Frozen Yogurt, Fast Food, Burgers, Restaurants, Food |
| Vietnamese, Food, Restaurants, Food Trucks |
| American (Traditional), Restaurants, Diners, Breakfast & Brunch |
| Food, Delis, Italian, Bakeries, Restaurants |
| Sushi Bars, Restaurants, Japanese |
| Korean, Restaurants |
| Coffee & Tea, Food, Cafes, Bars, Wine Bars, Restaurants, Nightlife |
# Ten states with the highest median business star rating.
# NOTE(review): `stars` already appears numeric in the summary output, so
# type_convert() presumably has no effect on it -- confirm before removing.
business %>%
  type_convert(cols(stars = col_double())) %>%
  select(state, stars) %>%
  group_by(state) %>%
  summarise(Stars = median(stars)) %>%
  arrange(desc(Stars)) %>%
  head(10)
state <chr> | Stars <dbl> | |||
|---|---|---|---|---|
| MT | 5.00 | |||
| SD | 4.50 | |||
| UT | 4.50 | |||
| VT | 4.50 | |||
| HI | 4.25 | |||
| CA | 4.00 | |||
| CO | 4.00 | |||
| FL | 4.00 | |||
| ID | 4.00 | |||
| IN | 4.00 |
# Find the five businesses with the most reviews in the loaded data and
# attach their business details.
# The join key is stated explicitly so the match cannot silently widen to
# other columns the two tables happen to share.
maxreviews <- review %>%
  group_by(business_id) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(5) %>%
  inner_join(business, by = "business_id")
maxreviews
business_id <chr> | Count <int> | name <chr> | address <chr> | city <chr> | |
|---|---|---|---|---|---|
| GBTPC53ZrG1ZBY3DT8Mbcw | 2609 | Luke | 333 Saint Charles Ave | New Orleans | |
| PY9GRfzr4nTZeINf346QOw | 1272 | Peppermill Reno | 2707 S Virginia St | Reno | |
| W4ZEKkva9HpAdZG88juwyQ | 1182 | Mr. B's Bistro | 201 Royal St | New Orleans | |
| vN6v8m4DO45Z4pp8yxxF_w | 1177 | Surrey's Café & Juice Bar | 1418 Magazine St | New Orleans | |
| SZU9c8V2GuREDN5KgyHFJw | 1067 | Santa Barbara Shellfish Company | 230 Stearns Wharf | Santa Barbara |
# Show the five establishments with the largest number of 5-star reviews.
# The join key is stated explicitly so the match cannot silently widen to
# other columns the two tables happen to share.
review %>%
  filter(stars == 5) %>%
  group_by(business_id) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(5) %>%
  inner_join(business_wide, by = "business_id")
business_id <chr> | Count <int> | name <chr> | address <chr> | city <chr> | state <chr> | |
|---|---|---|---|---|---|---|
| GBTPC53ZrG1ZBY3DT8Mbcw | 1254 | Luke | 333 Saint Charles Ave | New Orleans | LA | |
| vN6v8m4DO45Z4pp8yxxF_w | 706 | Surrey's Café & Juice Bar | 1418 Magazine St | New Orleans | LA | |
| 8uF-bhJFgT4Tn6DTb27viA | 653 | District Donuts Sliders Brew | 2209 Magazine St | New Orleans | LA | |
| W4ZEKkva9HpAdZG88juwyQ | 625 | Mr. B's Bistro | 201 Royal St | New Orleans | LA | |
| UCMSWPqzXjd7QHq7v8PJjQ | 608 | Prep & Pastry | 2660 N Campbell Ave | Tucson | AZ |
# Collect every review written for the business named "Luke".
# Both tables have a `stars` column (one rating per business vs. one rating
# per review). A natural join matches on `business_id` AND `stars`, which
# silently drops every review whose rating differs from the business-level
# value. The join is therefore restricted to `business_id`; the review rating
# keeps the name `stars` and the business-level one becomes `stars_business`.
lukebusiness <- business %>%
  filter(name == "Luke")
lukeJoined_tbl <- tibble(
  inner_join(
    lukebusiness, review,
    by = "business_id",
    suffix = c("_business", "")
  )
)
# Derive the calendar date and its month / day / year / hour components from
# the review timestamp string.
lukeDateFormatted <- mutate(
  lukeJoined_tbl,
  date_formatted = as_date(date),
  month_formatted = month(date),
  day_formatted = day(date),
  year_formatted = year(date),
  hour_formatted = hour(date)
)
# Animated line chart of the number of Luke reviews per year.
# The line colour is a fixed value, so it is set on the layer. Inside aes()
# the hex string would be mapped as a data value, giving a default palette
# colour plus a legend instead of the intended colour.
lukeDateFormatted %>%
  select(year_formatted) %>%
  group_by(year_formatted) %>%
  summarise(NumberofReviews = n()) %>%
  ggplot(aes(x = year_formatted, y = NumberofReviews)) +
  xlab("Year") +
  ylab("Review Count") +
  ggtitle("Yearly review count for Luke") +
  geom_line(colour = fillcolor) +
  scale_x_continuous(
    breaks = seq(2008, 2018, 1),
    labels = seq(2008, 2018, 1)
  ) +
  theme_classic() +
  transition_reveal(year_formatted)
## `geom_line()`: Each group consists of only one observation.
## i Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## i Do you need to adjust the group aesthetic?
# Saves the most recently rendered animation.
anim_save("lukenoreviewsyear.gif")
# Draw a word cloud of the 30 most frequent words in the `text` column of `x`.
#
# x: data frame with a character column `text` (one document per row).
#
# Stop words and a few words that dominate the Luke reviews without carrying
# meaning ("luke", "restaurant", "menu") are removed before counting.
# The function previously ignored `x` and always read the global
# `lukeDateFormatted`; it now works on the data it is given.
createWordCloud <- function(x) {
  ignored_words <- c("luke", "restaurant", "menu")
  x %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word, !word %in% ignored_words) %>%
    count(word, sort = TRUE) %>%
    ungroup() %>%
    head(30) %>%
    with(wordcloud(word, n, max.words = 30, colors = brewer.pal(8, "Dark2")))
}
# Pass the Luke reviews explicitly (this is the data the cloud was always
# built from).
createWordCloud(lukeDateFormatted)
# Bar chart of the 20 words contributing most (positively or negatively) to
# the AFINN sentiment of the `text` column of `SC`.
#
# SC: data frame with a character column `text` (one document per row).
# Returns the ggplot object.
positiveWordsBarGraph <- function(SC) {
  # After count() there is one row per word, so a word's contribution is its
  # frequency times its AFINN score. Summing `value` per word would only
  # return the raw score and ignore how often the word is used.
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    count(word, sort = TRUE) %>%
    ungroup() %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    mutate(contribution = n * value)

  contributions %>%
    # Exactly 20 rows with the largest absolute contribution.
    slice_max(abs(contribution), n = 20, with_ties = FALSE) %>%
    mutate(word = reorder(word, contribution)) %>%
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    theme_bw()
}
positiveWordsBarGraph(lukeJoined_tbl)